Question 1: Male and Female Graduate Enrollment (1967-2015)
# Exploring the data
enroll_explore <- ggplot(enroll_tidy_df, aes(x = Year, y = Enrollment)) +
geom_point(aes(color = Sex))
enroll_explore
# Female enrollment looks to be increasing at a much faster rate than male enrollment. Male enrollment was initially higher, but in the late 1980's, female enrollment surpassed male enrollement. Both enrollments level out at 2010. Male enrollment goes through stagnant periods and short spurts of increasing.
# Running a linear regression on dependent variable (y = male enrollment) by independent variale (x = year)
enroll_male_lm1 <- lm(`total Males` ~ Year, data = grad_enroll_df)
summary(enroll_male_lm1)
##
## Call:
## lm(formula = `total Males` ~ Year, data = grad_enroll_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96461 -34861 -12841 51876 95766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17112153 1087024 -15.74 <2e-16 ***
## Year 9069 546 16.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8514
## F-statistic: 276 on 1 and 47 DF, p-value: < 2.2e-16
plot(enroll_male_lm1)
# Running a linear regression on dependent variable (y = female enrollment) by independent variable (x = year)
enroll_female_lm1 <- lm(`total Females` ~ Year, data = grad_enroll_df)
summary(enroll_female_lm1)
##
## Call:
## lm(formula = `total Females` ~ Year, data = grad_enroll_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89397 -48101 -7633 45267 129727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.896e+07 1.161e+06 -50.77 <2e-16 ***
## Year 3.013e+04 5.832e+02 51.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared: 0.9827, Adjusted R-squared: 0.9823
## F-statistic: 2669 on 1 and 47 DF, p-value: < 2.2e-16
plot(enroll_female_lm1)
# To show the trends graphically, we will plot a graph over time of female and male enrollment to show the differences in trends.
enroll_tidy_df1 <- enroll_tidy_df %>%
mutate(enrollment = Enrollment/1000000)
grad_enroll_graph <- ggplot(enroll_tidy_df1, aes(x = Year, y = enrollment)) +
geom_line(aes(color = Sex), size = 1) +
theme_linedraw() +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0), breaks = seq(0, 2, .25), limits = c(0,2)) +
labs(x = "Year", y = "Total Graduate Enrollment (Millions of Students)", title ="Graduate Enrollment in the United States (1967-2015)") + scale_color_brewer(palette = "Pastel1")
grad_enroll_graph
Figure 1. Graduate Enrollment in the United States (1967 - 2015) Total enrollment of graduate students in the United States from 1967 - 2015 (in millions of students). Blue line indicates Male students, red line indicates Female students. Source: U.S. Department of Education, National Center for Education Statistics(superscript to full citation)
Question 2: Shifts in Female PhD Recipients by Field (1985,2000,2015)
#Describe if and how there was a shift in PhDs awarded to females in four fields (Physical and Earth Sciences, Engineering, Education, and Humanities & Arts) in 1985, 2000, and 2015. Describe your results statistically, in a graph or table, and in text.
#Tried to make a dataframe to graph the trends.. this was attempt 1:
phd_female_fields <- phd_field_df %>%
select("Field of study and sex", "Gender", "Number 1985", "Percent 1985", "Number 2000", "Percent 2000", "Number 2015", "Percent 2015") %>%
filter(Gender == "Female", `Field of study and sex` == "Physical sciences and earth sciences" | `Field of study and sex` == "Engineering" | `Field of study and sex` == "Education" | `Field of study and sex` == "Humanities and arts")
#but realized it was in a terrible format that R couldn't deal with so went back into excel and made a new one
#Attempt 2:
#reassign year to be a character
female_phd_tidy$Year <- as.character(female_phd_tidy$Year)
phd_female_number_graph <- ggplot(female_phd_tidy, aes(x = Year, y = `Number Enrolled`, group = `Field of study and sex`)) +
geom_col(position = "dodge", aes(fill = `Field of study and sex`)) +
theme_classic() +
scale_y_continuous(expand = c(0,0)) +
scale_fill_brewer(palette = "Pastel1", name = "Field of Study") +
labs(title = "Number of Female PhD Enrollments by Field (1985-2015)")
phd_female_number_graph
#Does this make sense, or is it better to group by year???
#I can't get the colors to change :(
#ok making another one that's based on percents
phd_female_percent_graph <- ggplot(female_phd_tidy, aes(x = Year, y = `Percent Enrolled`, group = `Field of study and sex`)) +
geom_col(position = "dodge", aes(fill = `Field of study and sex`)) +
theme_linedraw() +
labs(title = "Percent of Female PhD Enrollments by Field (1985-2015)") +
scale_fill_brewer(palette = "Pastel1", name = "Field of Study")
phd_female_percent_graph
#proptables and chi square
row.names(female_numbers_tidy) <- female_numbers_tidy$`Field of study and sex`
## Warning: Setting row names on a tibble is deprecated.
female_prop_final <- female_numbers_tidy %>%
select(`1985`, `2000`, `2015`)
phd_chi <- chisq.test(female_prop_final)
phd_chi
##
## Pearson's Chi-squared test
##
## data: female_prop_final
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
#Ok proportions are significantly different, p < 2.2e-16
1985 2000 2015
Physical sciences and earth sciences -10.376557 -8.559263 17.031216 Engineering -24.746239 -12.702359 33.184446 Education 29.337381 7.576085 -32.127993 Humanities and arts -5.666481 7.947822 -2.866323
Question 3: Male and Female Salaries for Starting PostDoc & Other Employment Positions
# Running paired t-tests for the median salaries of male and females
# Regular employment
# It doesn't work! need to do mann whitney U/wilcox
phd_med_sal_tidy_df
## # A tibble: 60 x 7
## Field Median.Salary Sex Planas X5 X6 X7
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Agricultural sciences a… 78000 FEMALE EMPLOY… <NA> <NA> <NA>
## 2 Biological and biomedic… 75000 FEMALE EMPLOY… <NA> <NA> <NA>
## 3 Health sciences 75000 FEMALE EMPLOY… <NA> <NA> <NA>
## 4 Chemistry 80000 FEMALE EMPLOY… <NA> <NA> <NA>
## 5 Geosciences, atmospheri… 75167 FEMALE EMPLOY… <NA> <NA> <NA>
## 6 Physics and astronomy 95000 FEMALE EMPLOY… <NA> <NA> <NA>
## 7 Mathematics and compute… 105000 FEMALE EMPLOY… <NA> <NA> <NA>
## 8 Psychology 63000 FEMALE EMPLOY… <NA> <NA> <NA>
## 9 Economics 105000 FEMALE EMPLOY… <NA> <NA> <NA>
## 10 Social sciences 64000 FEMALE EMPLOY… <NA> <NA> <NA>
## # ... with 50 more rows
# Question 3
# Exploratory Analysis
# Make histogram of median salary vs sex, Employment
employment_test_df <- phd_med_sal_tidy_df %>%
filter(Planas == "EMPLOYMENT")
employment_test_hist <- ggplot(employment_test_df, aes(x = Median.Salary )) +
geom_histogram(bins = 6, aes(fill = Sex)) +
facet_wrap(~ Sex, scale = "free") +
theme_classic() +
theme(legend.position = "none") +
scale_y_continuous(expand = c(0,0))
employment_test_hist
# Make histogram of median salary vs sex, Employment
postdoc_test_df <- phd_med_sal_tidy_df %>%
filter(Planas == "POSTDOC")
postdoc_test_hist <- ggplot(postdoc_test_df, aes(x = Median.Salary )) +
geom_histogram(bins = 6, aes(fill = Sex)) +
facet_wrap(~ Sex, scale = "free") +
theme_classic() +
theme(legend.position = "none") +
scale_y_continuous(expand = c(0,0)) +
scale_fill_brewer(palette = "Pastel1")
postdoc_test_hist
# didn't work
med_emp_male <- c(78000, 75000, 75000, 80000, 75167, 95000, 105000, 63000, 105000, 64000, 95000, 71000, 52000, 123500, 62800)
med_emp_fem <- c(66000, 66000, 75000, 75000, 71750, 97650, 90000, 60000, 95750, 62000, 90000, 63000, 50000, 120000, 61000)
employment_wilcox <- wilcox.test(med_emp_male, med_emp_fem, exact = FALSE, paired = TRUE)
employment_wilcox
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_emp_male and med_emp_fem
## V = 101, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
med_post_male <- c(42750, 42000, 43000, 42000, 50000, 50000, 58000, 42000, 65000, 48000, 45000, 50000, 45000, 60000, 50000)
med_post_fem <- c(44000, 42000, 43250, 42000, 50000, 53000, 55000, 42000, 65000, 49250, 45000, 45000, 45000, 63500, 44000)
postdoc_wilcox <- wilcox.test(med_post_male, med_post_fem, exact = FALSE, paired = TRUE)
postdoc_wilcox
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_post_male and med_post_fem
## V = 19.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
# Column Graphs for Visualization
phd_med_sal_tidy_df2 <- phd_med_sal_tidy_df %>%
mutate(sex = ifelse(Sex == "FEMALE", "Male", "Female")) %>%
select(Field, Median.Salary, Planas, sex)
phd_med_tidy_employ <- phd_med_sal_tidy_df2 %>%
filter(Planas == "EMPLOYMENT")
employ_graph <- ggplot(phd_med_tidy_employ, aes(x = Field, y = Median.Salary)) +
geom_col(position = "dodge", aes(fill = sex)) +
theme_linedraw() +
coord_flip() +
scale_y_continuous(expand = c(0,0), breaks = seq(0, 125000, 25000), limits = c(0,125000)) +
labs(title = "Median Salaries in non-PostDoc Positions", x = "Field", y = "Median Salary ($)") + scale_fill_brewer(palette = "Pastel1", name = "Sex")
employ_graph
phd_med_tidy_post <- phd_med_sal_tidy_df2 %>%
filter(Planas == "POSTDOC")
postdoc_graph <- ggplot(phd_med_tidy_post, aes(x = Field, y = Median.Salary)) +
geom_col(position = "dodge", aes(fill = sex)) +
theme_linedraw() +
coord_flip() +
scale_y_continuous(expand = c(0,0), breaks = seq(0, 80000, 20000), limits = c(0,80000)) +
labs(title = "Median Salaries in PostDoc Positions", x = "Field", y = "Median Salary ($)") + scale_fill_brewer(palette = "Pastel1", name = "Sex")
postdoc_graph
Question 4: Exploring Academic Salaries for Professors in U.S. Colleges
# Create Histogram Years Since PhD vs Salary
years_since_phd_gph <- ggplot(fac_sal_df, aes(x = Years.Since.PhD, y = Salary)) +
geom_point(aes(color = Sex), alpha = 0.5) +
facet_wrap(~ Faculty.Rank) +
scale_fill_brewer(palette = "Pastel1")
years_since_phd_gph
#Linear regession exploration
pairs_df <- fac_sal_df %>%
select(Years.Since.PhD, Years.Faculty.Service, Salary)
pairs(pairs_df)
#Salary ad years since PHD looks more linear thean Years.faculty.service
salary_lm <- lm(Salary ~ Sex + Years.Faculty.Service + Years.Since.PhD + Discipline + Faculty.Rank, data = fac_sal_df)
salary_lm
##
## Call:
## lm(formula = Salary ~ Sex + Years.Faculty.Service + Years.Since.PhD +
## Discipline + Faculty.Rank, data = fac_sal_df)
##
## Coefficients:
## (Intercept) SexMale Years.Faculty.Service
## 78862.8 4783.5 -489.5
## Years.Since.PhD DisciplineB Faculty.RankAsstProf
## 535.1 14417.6 -12907.6
## Faculty.RankProf
## 32158.4
# Set rank reference level
fac_sal_df$Faculty.Rank <- fct_relevel(fac_sal_df$Faculty.Rank, "AsstProf")
# b. Model summary:
summary(salary_lm)
##
## Call:
## lm(formula = Salary ~ Sex + Years.Faculty.Service + Years.Since.PhD +
## Discipline + Faculty.Rank, data = fac_sal_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65248 -13211 -1775 10384 99592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78862.8 4990.3 15.803 < 2e-16 ***
## SexMale 4783.5 3858.7 1.240 0.21584
## Years.Faculty.Service -489.5 211.9 -2.310 0.02143 *
## Years.Since.PhD 535.1 241.0 2.220 0.02698 *
## DisciplineB 14417.6 2342.9 6.154 1.88e-09 ***
## Faculty.RankAsstProf -12907.6 4145.3 -3.114 0.00198 **
## Faculty.RankProf 32158.4 3540.6 9.083 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463
## F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
AIC(salary_lm)
## [1] 9093.826
# AIC = 9093.826
#Visualize residuals
plot(salary_lm)
vif(salary_lm)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.030805 1 1.015285
## Years.Faculty.Service 5.923038 1 2.433729
## Years.Since.PhD 7.518936 1 2.742068
## Discipline 1.064105 1 1.031555
## Faculty.Rank 2.013193 2 1.191163
#ok yikes so years of faculty service and years since PhD are super correlated.
#I feel like basically we only use one of those I feel like that's fine since most people will work right after their phd, doesn't make sense to include both.
salary_lm2 <- lm(Salary ~ Sex + Years.Since.PhD + Discipline + Faculty.Rank, data = fac_sal_df)
salary_lm2
##
## Call:
## lm(formula = Salary ~ Sex + Years.Since.PhD + Discipline + Faculty.Rank,
## data = fac_sal_df)
##
## Coefficients:
## (Intercept) SexMale Years.Since.PhD
## 67884.32 4349.37 61.01
## DisciplineB Faculty.RankAssocProf Faculty.RankProf
## 13937.47 13104.15 46032.55
summary(salary_lm2)
##
## Call:
## lm(formula = Salary ~ Sex + Years.Since.PhD + Discipline + Faculty.Rank,
## data = fac_sal_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67451 -13860 -1549 10716 97023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67884.32 4536.89 14.963 < 2e-16 ***
## SexMale 4349.37 3875.39 1.122 0.26242
## Years.Since.PhD 61.01 127.01 0.480 0.63124
## DisciplineB 13937.47 2346.53 5.940 6.32e-09 ***
## Faculty.RankAssocProf 13104.15 4167.31 3.145 0.00179 **
## Faculty.RankProf 46032.55 4240.12 10.856 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared: 0.4472, Adjusted R-squared: 0.4401
## F-statistic: 63.27 on 5 and 391 DF, p-value: < 2.2e-16
AIC(salary_lm2)
## [1] 9097.22
#AIC = 9097.22
plot(salary_lm2)
#resdiuals are bad, but the qq plot looks good.
vif(salary_lm2)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.028359 1 1.014080
## Years.Since.PhD 2.065517 1 1.437191
## Discipline 1.055727 1 1.027486
## Faculty.Rank 1.987205 2 1.187301
#woooo all are under 4 so we are good to go re: colinearlity
#my prediction is that male/female doesn't change the slope of the line, they just have differne t starting point, so we need that interation term to make up for that?
#I will graph it to see if it's visual
#ok no that predcition was wrong. Sex doesn't appear to be that significant, there's just way more males than females, and way more old males.
#For exploratory purposes, what about not including sex?
salary_lm3 <- lm(Salary ~ Years.Since.PhD + Discipline + Faculty.Rank, data = fac_sal_df)
salary_lm3
##
## Call:
## lm(formula = Salary ~ Years.Since.PhD + Discipline + Faculty.Rank,
## data = fac_sal_df)
##
## Coefficients:
## (Intercept) Years.Since.PhD DisciplineB
## 71405.40 71.92 14028.68
## Faculty.RankAssocProf Faculty.RankProf
## 13030.16 46211.57
AIC(salary_lm3)
## [1] 9096.497
#AIC = 9096.497
plot(salary_lm3)
vif(salary_lm3)
## GVIF Df GVIF^(1/(2*Df))
## Years.Since.PhD 2.053425 1 1.432978
## Discipline 1.054461 1 1.026869
## Faculty.Rank 1.978933 2 1.186063
#For exploratory purposes, maybe remove faculty rank, since that goes along with years since PHD?
salary_lm4 <- lm(Salary ~ Years.Since.PhD + Discipline, data = fac_sal_df)
salary_lm4
##
## Call:
## lm(formula = Salary ~ Years.Since.PhD + Discipline, data = fac_sal_df)
##
## Coefficients:
## (Intercept) Years.Since.PhD DisciplineB
## 80158 1119 15784
AIC(salary_lm4)
## [1] 9217.566
#AIC = 9217.566
plot(salary_lm4)
vif(salary_lm4)
## Years.Since.PhD Discipline
## 1.049937 1.049937
#For exploratory purposes, maybe remove years, since that goes along with years since Rank? Also putting sex back in.
salary_lm5 <- lm(Salary ~ Faculty.Rank + Discipline + Sex, data = fac_sal_df)
salary_lm5
##
## Call:
## lm(formula = Salary ~ Faculty.Rank + Discipline + Sex, data = fac_sal_df)
##
## Coefficients:
## (Intercept) Faculty.RankAssocProf Faculty.RankProf
## 68224 13723 47403
## DisciplineB SexMale
## 13709 4492
AIC(salary_lm5)
## [1] 9095.454
#AIC = 9095.454
plot(salary_lm5)
vif(salary_lm5)
## GVIF Df GVIF^(1/(2*Df))
## Faculty.Rank 1.034437 2 1.008500
## Discipline 1.012236 1 1.006099
## Sex 1.022339 1 1.011108
gender_dif_graph <- ggplot(fac_sal_df, aes(x = Years.Since.PhD, y = Salary, group = Sex)) +
geom_point(aes(color = Faculty.Rank, shape = Discipline)) +
facet_wrap(~Sex) +
scale_colour_brewer(palette = "Pastel1") +
theme_bw()
gender_dif_graph
regression_table <- stargazer(salary_lm5, type = "html")
| Dependent variable: | |
| Salary | |
| Faculty.RankAssocProf | 13,723.420*** |
| (3,959.014) | |
| Faculty.RankProf | 47,403.320*** |
| (3,133.108) | |
| DisciplineB | 13,708.690*** |
| (2,295.437) | |
| SexMale | 4,491.801 |
| (3,860.240) | |
| Constant | 68,223.530*** |
| (4,477.200) | |
| Observations | 397 |
| R2 | 0.447 |
| Adjusted R2 | 0.441 |
| Residual Std. Error | 22,640.990 (df = 392) |
| F Statistic | 79.180*** (df = 4; 392) |
| Note: | p<0.1; p<0.05; p<0.01 |